# -*- coding: utf-8 -*-
import scrapy
import random
import time
from douban.settings import IP_LIST_FILE
from utils.re_douban import ip_re, port_re,proxy_type_re


class XiciSpider(scrapy.Spider):
    """Scrape free proxies from xicidaili.com and persist them to IP_LIST_FILE.

    Each successfully parsed row is written as one ``scheme://host:port``
    line (e.g. ``http://1.2.3.4:8080``), overwriting the file on every run.
    """

    name = 'xici'
    allowed_domains = ['www.xicidaili.com']

    def start_requests(self):
        """Kick off the crawl at the high-anonymity ("nn") proxy listing.

        :return: generator yielding the initial :class:`scrapy.Request`
        """
        start_url = "https://www.xicidaili.com/nn/5"
        yield scrapy.Request(url=start_url, callback=self.parse)

    def parse(self, response):
        """Extract (ip, port, scheme) from each listing row and rewrite
        IP_LIST_FILE with one proxy URL per line.

        NOTE(review): ``tr.odd`` matches only alternate table rows — on
        xicidaili the interleaved rows carry no class attribute. Confirm
        whether skipping half the listings is intentional.

        :param response: listing page response from xicidaili.com
        """
        # 'w' mode: each crawl replaces the previous proxy list wholesale.
        with open(IP_LIST_FILE, 'w') as f:
            for row in response.css('tr.odd'):
                # Raw HTML of the row; the regex helpers pull fields out of it.
                # (Renamed from `str`, which shadowed the builtin.)
                row_html = row.extract()
                ip = ip_re(row_html)
                port = port_re(row_html)
                proxy_type = proxy_type_re(row_html)

                # Persist only rows where all three fields were extracted.
                if ip and port and proxy_type:
                    print(ip, port)
                    # e.g. http://host1:port
                    tmp_str = "{}://{}:{}".format(proxy_type.lower(), ip, port)
                    f.write(tmp_str + '\n')

        print('<------------------->')
